In [29]:
# to import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier
In [3]:
#load dataset
df = pd.read_csv('c:\\Users\\HP\\Downloads\\DS-ML\Churn_Modelling.csv')
df.head()
Out[3]:
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Churn
0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1
1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0
2 3 15619304 Onio 502 France Female 42 8 159660.80 3 1 0 113931.57 1
3 4 15701354 Boni 699 France Female 39 1 0.00 2 0 0 93826.63 0
4 5 15737888 Mitchell 850 Spain Female 43 2 125510.82 1 1 1 79084.10 0
In [4]:
# getting information about dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Churn            10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB
In [5]:
# more info on rows and collumns
df.shape
Out[5]:
(10000, 14)
In [6]:
#information about missing data
round((df.isnull().sum() / df.shape[0]) * 100, 2)
Out[6]:
RowNumber          0.0
CustomerId         0.0
Surname            0.0
CreditScore        0.0
Geography          0.0
Gender             0.0
Age                0.0
Tenure             0.0
Balance            0.0
NumOfProducts      0.0
HasCrCard          0.0
IsActiveMember     0.0
EstimatedSalary    0.0
Churn              0.0
dtype: float64
In [7]:
# identify collumn data types
cat_col = [col for col in df.columns if df[col].dtype == 'object']
num_col = [col for col in df.columns if df[col].dtype != 'object']

print('Categorical columns:', cat_col)
print('Numerical columns:', num_col)
Categorical columns: ['Surname', 'Geography', 'Gender']
Numerical columns: ['RowNumber', 'CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Churn']
In [8]:
#count unique data for categorical data types
df[cat_col].nunique()
Out[8]:
Surname      2932
Geography       3
Gender          2
dtype: int64
In [9]:
#statistical summary of dataset
df.describe()
Out[9]:
RowNumber CustomerId CreditScore Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Churn
count 10000.00000 1.000000e+04 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.00000 10000.000000 10000.000000 10000.000000
mean 5000.50000 1.569094e+07 650.528800 38.921800 5.012800 76485.889288 1.530200 0.70550 0.515100 100090.239881 0.203700
std 2886.89568 7.193619e+04 96.653299 10.487806 2.892174 62397.405202 0.581654 0.45584 0.499797 57510.492818 0.402769
min 1.00000 1.556570e+07 350.000000 18.000000 0.000000 0.000000 1.000000 0.00000 0.000000 11.580000 0.000000
25% 2500.75000 1.562853e+07 584.000000 32.000000 3.000000 0.000000 1.000000 0.00000 0.000000 51002.110000 0.000000
50% 5000.50000 1.569074e+07 652.000000 37.000000 5.000000 97198.540000 1.000000 1.00000 1.000000 100193.915000 0.000000
75% 7500.25000 1.575323e+07 718.000000 44.000000 7.000000 127644.240000 2.000000 1.00000 1.000000 149388.247500 0.000000
max 10000.00000 1.581569e+07 850.000000 92.000000 10.000000 250898.090000 4.000000 1.00000 1.000000 199992.480000 1.000000
In [10]:
#correlation analysis
df.corr(numeric_only=True)
Out[10]:
RowNumber CustomerId CreditScore Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Churn
RowNumber 1.000000 0.004202 0.005840 0.000783 -0.006495 -0.009067 0.007246 0.000599 0.012044 -0.005988 -0.016571
CustomerId 0.004202 1.000000 0.005308 0.009497 -0.014883 -0.012419 0.016972 -0.014025 0.001665 0.015271 -0.006248
CreditScore 0.005840 0.005308 1.000000 -0.003965 0.000842 0.006268 0.012238 -0.005458 0.025651 -0.001384 -0.027094
Age 0.000783 0.009497 -0.003965 1.000000 -0.009997 0.028308 -0.030680 -0.011721 0.085472 -0.007201 0.285323
Tenure -0.006495 -0.014883 0.000842 -0.009997 1.000000 -0.012254 0.013444 0.022583 -0.028362 0.007784 -0.014001
Balance -0.009067 -0.012419 0.006268 0.028308 -0.012254 1.000000 -0.304180 -0.014858 -0.010084 0.012797 0.118533
NumOfProducts 0.007246 0.016972 0.012238 -0.030680 0.013444 -0.304180 1.000000 0.003183 0.009612 0.014204 -0.047820
HasCrCard 0.000599 -0.014025 -0.005458 -0.011721 0.022583 -0.014858 0.003183 1.000000 -0.011866 -0.009933 -0.007138
IsActiveMember 0.012044 0.001665 0.025651 0.085472 -0.028362 -0.010084 0.009612 -0.011866 1.000000 -0.011421 -0.156128
EstimatedSalary -0.005988 0.015271 -0.001384 -0.007201 0.007784 0.012797 0.014204 -0.009933 -0.011421 1.000000 0.012097
Churn -0.016571 -0.006248 -0.027094 0.285323 -0.014001 0.118533 -0.047820 -0.007138 -0.156128 0.012097 1.000000
In [11]:
sns.countplot(x='Churn', data=df, hue='Churn', palette='Set1')
plt.title('Churn Modelling')
plt.show()
No description has been provided for this image
In [12]:
# generate profiling report 
from ydata_profiling import ProfileReport
df.profile_report()
Upgrade to ydata-sdk

Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
100%|██████████| 14/14 [00:00<00:00, 36.68it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[12]:

In [13]:
plt.boxplot(df['Age'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Age')
plt.title('Box Plot')
plt.show()
No description has been provided for this image
In [14]:
X = df[['CustomerId','Surname','CreditScore', 'Geography','Gender','Age','Tenure', 'Balance','NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']]
Y = df['Churn']
In [24]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

scaler = MinMaxScaler(feature_range=(0, 1))

x1 = X
num_cols = x1.select_dtypes(include=['int64', 'float64']).columns
x1[num_cols] = scaler.fit_transform(x1[num_cols])

# encoding categorical features
cat_cols = x1.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_cols:
    x1[cat_cols] = le.fit_transform(x1[cat_cols])
x1.head()
C:\Users\HP\AppData\Local\Temp\ipykernel_8788\4292956829.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x1[num_cols] = scaler.fit_transform(x1[num_cols])
Out[24]:
CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary
0 0.275616 0.380416 0.538 0.0 0.0 0.324324 0.2 0.000000 0.000000 1.0 1.0 0.506735
1 0.326454 0.401569 0.516 1.0 0.0 0.310811 0.1 0.334031 0.000000 0.0 1.0 0.562709
2 0.214421 0.696008 0.304 0.0 0.0 0.324324 0.8 0.636357 0.666667 1.0 0.0 0.569654
3 0.542636 0.098601 0.698 0.0 0.0 0.283784 0.1 0.000000 0.333333 0.0 0.0 0.469120
4 0.688778 0.621631 1.000 1.0 0.0 0.337838 0.2 0.500246 0.000000 1.0 1.0 0.395400
In [25]:
# standardizing numerical features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(x1)
print(X_standardized[:5])
[[-0.78321342 -0.46418322 -0.32622142 -0.90188624 -1.09598752  0.29351742
  -1.04175968 -1.22584767 -0.91158349  0.64609167  0.97024255  0.02188649]
 [-0.60653412 -0.3909112  -0.44003595  1.51506738 -1.09598752  0.19816383
  -1.38753759  0.11735002 -0.91158349 -1.54776799  0.97024255  0.21653375]
 [-0.99588476  0.62898807 -1.53679418 -0.90188624 -1.09598752  0.29351742
   1.03290776  1.33305335  2.52705662  0.64609167 -1.03067011  0.2406869 ]
 [ 0.14476652 -1.44035563  0.50152063 -0.90188624 -1.09598752  0.00745665
  -1.38753759 -1.22584767  0.80773656 -1.54776799 -1.03067011 -0.10891792]
 [ 0.65265871  0.37135419  2.06388377  1.51506738 -1.09598752  0.38887101
  -1.04175968  0.7857279  -0.91158349  0.64609167  0.97024255 -0.36527578]]
In [26]:
# preparing for train-test split
categorical_features_X = X.select_dtypes(include=['object']).columns.tolist()
numerical_features_X = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features_X),
        ("num", StandardScaler(), numerical_features_X)
    ]
)
In [27]:
# train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
In [30]:
# building random forest model
rf_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        class_weight='balanced'  # handle imbalanced classes
    ))
])
# Fit
rf_model.fit(X_train, Y_train)
Out[30]:
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  []),
                                                 ('num', StandardScaler(),
                                                  ['CustomerId', 'Surname',
                                                   'CreditScore', 'Geography',
                                                   'Gender', 'Age', 'Tenure',
                                                   'Balance', 'NumOfProducts',
                                                   'HasCrCard',
                                                   'IsActiveMember',
                                                   'EstimatedSalary'])])),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        n_estimators=300, random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  []),
                                                 ('num', StandardScaler(),
                                                  ['CustomerId', 'Surname',
                                                   'CreditScore', 'Geography',
                                                   'Gender', 'Age', 'Tenure',
                                                   'Balance', 'NumOfProducts',
                                                   'HasCrCard',
                                                   'IsActiveMember',
                                                   'EstimatedSalary'])])),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        n_estimators=300, random_state=42))])
ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                 []),
                                ('num', StandardScaler(),
                                 ['CustomerId', 'Surname', 'CreditScore',
                                  'Geography', 'Gender', 'Age', 'Tenure',
                                  'Balance', 'NumOfProducts', 'HasCrCard',
                                  'IsActiveMember', 'EstimatedSalary'])])
[]
OneHotEncoder(handle_unknown='ignore')
['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
StandardScaler()
RandomForestClassifier(class_weight='balanced', n_estimators=300,
                       random_state=42)
In [31]:
# accuracy and classification report
rf_model.fit(X_train, Y_train)

rf_preds = rf_model.predict(X_test)

print("\n=== RANDOM FOREST RESULTS ===")
print("Accuracy:", accuracy_score(Y_test, rf_preds))
print(classification_report(Y_test, rf_preds))
=== RANDOM FOREST RESULTS ===
Accuracy: 0.8664
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      2003
           1       0.81      0.42      0.56       497

    accuracy                           0.87      2500
   macro avg       0.84      0.70      0.74      2500
weighted avg       0.86      0.87      0.85      2500